{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### IO操作(读取本地数据文件)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "us_file_path = \"./youtube_video_data/US_video_data_numbers.csv\"\n",
    "uk_file_path = './youtube_video_data/GB_video_data_numbers.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t1 = np.loadtxt(us_file_path, delimiter=',')\n",
    "t1"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "科学计数法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t1 = np.loadtxt(us_file_path, delimiter=',', dtype='int')\n",
    "t1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t2 = np.loadtxt(us_file_path, delimiter=',', dtype='int', unpack=True)\n",
    "t2"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "转置"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 索引和切片"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 取行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取行\n",
    "t1[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取连续的多行\n",
    "t1[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取不连续的多行\n",
    "t1[1,4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取不连续的多行\n",
    "t1[[1,4]] #第2行和第5行"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 取列\n",
    "\n",
    "#### 取整列\n",
    "\n",
    "如果切片只有冒号，表示选取该方向的整个轴"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取一行\n",
    "t2[1,:] #等价于 t2[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取连续的行\n",
    "t2[2:,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t2[1,0,3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取不连续的行\n",
    "t2[[1,0,3],:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取第一列\n",
    "t2[:,0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取连续的多列\n",
    "t2[:,2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取不连续的多列\n",
    "t2[:,[2,4]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取值\n",
    "a = t2[2,3]\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取连续行和连续列\n",
    "t2[0:2,2:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取多个不相邻的点\n",
    "t2[[0,2,2],[0,1,3]] #3个点,坐标为: (0,0), (2,1), (2,3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = np.arange(24).reshape((4,6))\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t<10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t[t<10] = 3\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t[t>20]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 三元运算符\n",
    "\n",
    "`where`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 3 if 3>2 else 4\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 3 if 3<2 else 4\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = np.arange(24).reshape((4,6))\n",
    "t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.where(t<=3,100,300)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### clp 截尾操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  1.,  2.,  3.,  4.,  5.],\n",
       "       [ 6.,  7.,  8.,  9., 10., 11.],\n",
       "       [12., 13., 14., 15., 16., 17.],\n",
       "       [18., 19., 20., 21., 22., 23.]])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t3 = np.arange(24,dtype=np.float64).reshape((4,6)) #必须是浮点数类型才可以转换成 np.nan\n",
    "t3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  1.,  2.,  3.,  4.,  5.],\n",
       "       [ 6.,  7.,  8.,  9., 10., 11.],\n",
       "       [12., 13., 14., 15., 16., 17.],\n",
       "       [18., 19., 20., nan, nan, nan]])"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t3[t3>20] = np.nan\n",
    "t3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[10., 10., 10., 10., 10., 10.],\n",
       "       [10., 10., 10., 10., 10., 11.],\n",
       "       [12., 13., 14., 15., 16., 17.],\n",
       "       [18., 18., 18., nan, nan, nan]])"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t3.clip(10,18)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "小于10的替换为10，大于18的替换为了18，但是nan没有被替换，那么nan是什么？"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### numpy 中的 nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  1.,  2.,  3.],\n",
       "       [ 4.,  5.,  6.,  7.],\n",
       "       [ 8.,  9., 10., 11.]], dtype=float32)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t1 = np.arange(12, dtype=np.float32).reshape((3,4))\n",
    "t1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  1.,  2.,  3.],\n",
       "       [ 4.,  5., nan, nan],\n",
       "       [ 8.,  9., 10., 11.]], dtype=float32)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t1[1,2:] = np.nan\n",
    "t1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[False, False, False, False],\n",
       "       [False, False,  True,  True],\n",
       "       [False, False, False, False]])"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.isnan(t1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.,  1.,  2.,  3.,  4.,  5.,  8.,  9., 10., 11.], dtype=float32)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t1[t1 == t1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fill_ndarray(t1):\n",
    "    for i in range(t1.shape[1]): #遍历每一列\n",
    "        temp_col = t1[:,i] #当前的一列\n",
    "        nan_num = np.count_nonzero(temp_col != temp_col)\n",
    "        if nan_num: #不为0, 说明当前列中有nan\n",
    "            temp_not_nan_col = temp_col[temp_col == temp_col] #选出当前列中不为0的元素\n",
    "            temp_col[np.isnan(temp_col)] = temp_not_nan_col.mean() #计算均值\n",
    "    return t1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.  1.  2.  3.]\n",
      " [ 4.  5. nan nan]\n",
      " [ 8.  9. 10. 11.]]\n",
      "[[ 0.  1.  2.  3.]\n",
      " [ 4.  5.  6.  7.]\n",
      " [ 8.  9. 10. 11.]]\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    t1 = np.arange(12, dtype=np.float32).reshape((3,4))\n",
    "    t1[1,2:] = np.nan\n",
    "    print(t1)\n",
    "    t1 = fill_ndarray(t1)\n",
    "    print(t1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:analyze]",
   "language": "python",
   "name": "conda-env-analyze-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
